Homework 3 by Smutin Daniil

Prepare data

# Load the raw dataset and split it into haematology measurements (hem)
# and environmental descriptors (env).
df <- read_xlsx("Sleepy lizard.xlsx")

# The six blood-cell variables used throughout the analysis.
hem_vars <- c("Tot_WBC", "Het_ABS", "Lym_ABS",
              "H:L Ratio", "Mon_ABS", "OthG_ABS")
hem <- dplyr::select(df, dplyr::all_of(hem_vars))

env <- df[, 4:7]

# Recode the integer-coded factors into readable labels.
# match(x, 1:k) maps code i to label i and anything else to NA.
treatment_labels <- c("normal", "modified")
env$Treatment <- treatment_labels[match(env$Treatment, 1:2)]

habitat_labels <- c("normal", "swale plantings", "fields with crops", "under fallow")
env$Habitat <- habitat_labels[match(env$Habitat, 1:4)]

# NOTE(review): landscape labels are assigned in order of first
# appearance in the data — confirm this matches the intended
# LS1..LS3 coding before interpreting landscape effects.
landscape_labels <- c("LS1", "LS2", "LS3")
env$`Landscape ID` <- landscape_labels[match(env$`Landscape ID`, unique(env$`Landscape ID`))]

Normalizing data

# Raw-scale overview: one boxplot per blood variable with jittered
# points overlaid so every observation stays visible.
hem %>% 
  as.data.frame() %>% 
  pivot_longer(cols = 1:6) %>% 
  ggplot(aes(name, value)) +
  # use FALSE, not the reassignable shorthand F; outlier points are
  # hidden because the jitter layer already shows them
  geom_boxplot(outliers = FALSE) +
  geom_jitter(size = .1) +
  theme_minimal()

OK, standardize data and center it

# Centre a numeric vector; optionally also scale to unit variance.
# The default scale = FALSE preserves the original behaviour
# (centring only, matching the previously commented-out "/ sd(x)").
center <- function(x, scale = FALSE) {
  out <- x - mean(x)
  if (scale) out <- out / sd(x)
  out
}
# log10(x + 1): log transform that tolerates zero counts.
log10_1 <- function(x) log10(x + 1)

# Transform the blood variables:
#   1. apply(2, log10_1): log10(x + 1) on every column.
#   2. apply(1, center): centres each ROW and returns the result
#      TRANSPOSED (variables in rows, observations in columns —
#      apply with MARGIN = 1 always transposes).
#   3. the second apply(1, center) therefore centres what were the
#      original COLUMNS and transposes back to observations-in-rows.
# Net effect: double centring (row-wise, then column-wise), which the
# text below refers to as "double-centralization". Column names are
# preserved through the chain, so hem$Tot_WBC etc. still work.
# NOTE(review): apply() coerces the data frame to a matrix, so every
# column must be numeric here — the six blood counts are.
hem <- hem %>% 
  apply(2, log10_1) %>% 
  #apply(2, scale) %>% 
  apply(1, center) %>% 
  apply(1, center) %>% 
  as.data.frame()

# Post-transformation check: boxplot plus density for each variable,
# faceted by variable name. With only a y aesthetic, geom_density
# draws the density sideways, sharing the y axis with the boxplot.
hem %>% 
  pivot_longer(cols = 1:6) %>% 
  ggplot(aes(y = value)) +
  # FALSE instead of the reassignable shorthand F
  geom_boxplot(outliers = FALSE, aes(x = 0), width = .5) +
  geom_density() +
  facet_grid(~name, scales = "free") +
  theme_minimal()

The distributions are not perfectly normal, but they are close to it.

Removing known batch using LBSI

# Centre the known batch covariate (LBSI).
batch <- center(df$LBSI)

# Measure how strongly the blood variables predict the batch
# covariate (Gaussian GLM — the default family).
model_batch <- glm(
  batch ~
    hem$Tot_WBC + hem$Het_ABS + hem$Lym_ABS + 
    hem$`H:L Ratio` + hem$Mon_ABS + hem$OthG_ABS
)

# Standard diagnostic plots (residuals, Q-Q, leverage, ...).
plot(model_batch)

Observations 14, 81, and 85 are flagged as outliers. Check them later.

Viz

Full:

# Overview of all six variables split by Treatment, with individual
# points coloured by the (centred) LBSI batch covariate.
hem %>% 
  cbind(env) %>% 
  cbind(batch) %>% 
  pivot_longer(cols = 1:6) %>% 
  # removed a no-op mutate(value = value) that was dead code
  ggplot(mapping = aes("", value)) +
  geom_boxplot(aes(fill = Treatment), outliers = FALSE) +
  geom_jitter(aes(color = batch), size = .2, alpha = .5) +
  theme_minimal() +
  facet_wrap(~name, scales = "free") +
  scale_color_viridis_c()

Three out of six categories seem to differ. In contrast, without performing double-centring, 3-4 categories varied.

Is batch equal?

# Is the batch covariate balanced between the two treatments?
# Violin plots with quartile lines per Treatment group.
ggplot(cbind(env, batch), aes(Treatment, batch)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  theme_minimal()

OK, the batch covariate is nearly balanced across groups, but its influence can still be reduced. I choose a MANOVA approach for prediction. First of all, let's remove the outliers.

PCA (via `prcomp`; with Euclidean distances this is equivalent to a PCoA)

# Principal component analysis of the transformed blood variables.
# (prcomp performs a PCA; with Euclidean distances this is
# equivalent to a PCoA.)
df_pcoa <- prcomp(hem)
plot(df_pcoa)

OK, not bad distribution

# PCA scores coloured by landscape, shaped by habitat, and split by
# treatment; the text aesthetic carries a row ID for plotly hover.
gg <- df_pcoa[["x"]] %>% 
  cbind(env) %>% 
  mutate(LID = as.character(`Landscape ID`)) %>% 
  mutate(HBT = as.character(Habitat)) %>% 
  # seq_len(nrow(env)) replaces the hard-coded 1:122 so labels stay
  # correct if the number of rows ever changes
  ggplot(aes(PC1, PC2, color = LID, shape = HBT,
             text = paste0("ID: ", seq_len(nrow(env))))) +
  facet_grid(~Treatment) +
  geom_point() +
  theme_minimal()

ggplotly(gg)

Without the log transformation (using only the centring approach), the variances of the two groups become unequal.

# Same PCA scores, now coloured by the LBSI batch covariate to look
# for a batch gradient; shape distinguishes the treatments.
gg <- df_pcoa[["x"]] %>% 
  cbind(env) %>% 
  cbind(batch) %>% 
  mutate(LID = as.character(`Landscape ID`)) %>% 
  mutate(HBT = as.character(Habitat)) %>% 
  # seq_len(nrow(env)) replaces the hard-coded 1:122
  ggplot(aes(PC1, PC2, color = batch, shape = Treatment,
             text = paste0("ID: ", seq_len(nrow(env))))) +
  #facet_grid(~Treatment) +
  scale_color_viridis_c(name = "LBSI") +
  geom_point(size = 1, alpha = .7) +
  theme_minimal()

ggplotly(gg)

OK. It seems that lizards from unmodified areas show a bigger spread when row-wise standardization is skipped. With this procedure, nothing relates to their grouping. Also, there is approximately no grouping or gradient according to LBSI.

Group variance

# Multivariate homogeneity of group dispersions (vegan):
# Euclidean distances on the six transformed blood variables, then
# betadisper by Treatment and an ANOVA on the group distances.
# NOTE(review): `dfF` is not defined anywhere in this file — it is
# presumably `hem` with the `env`/`batch` columns attached in a chunk
# not shown here; confirm before re-running.
df_PCO <-dfF[,1:6] %>% 
  vegdist(method  = "euclidean") %>% 
  betadisper(dfF$Treatment)
plot(df_PCO)

# p = 0.34: no evidence that dispersions differ between treatments,
# so the perMANOVA below can be read as a location test.
anova(df_PCO)
## Analysis of Variance Table
## 
## Response: Distances
##            Df Sum Sq Mean Sq F value Pr(>F)
## Groups      1  0.395 0.39473  0.9336 0.3359
## Residuals 120 50.733 0.42278

Group dispersions are homogeneous now (with scaling via various datawizard package functions, this was not the case).

perMANOVA

# perMANOVA (vegan::adonis2): does Treatment explain variation in the
# blood profile after accounting for the batch covariate and their
# interaction? Euclidean distances; terms are sequential (Type I).
# NOTE(review): `dfF` is not defined in this file — see earlier note.
df_adonis <- adonis2(hem ~ dfF$Treatment*batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = hem ~ dfF$Treatment * batch, method = "euclidean")
##                      Df SumOfSqs      R2       F Pr(>F)    
## dfF$Treatment         1   35.458 0.24279 38.2035  0.001 ***
## batch                 1    0.521 0.00357  0.5614  0.583    
## dfF$Treatment:batch   1    0.543 0.00372  0.5846  0.545    
## Residual            118  109.519 0.74992                   
## Total               121  146.040 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The difference is significant. But what exactly is the reason for the differences?

GLM

# Logistic regression: predict Treatment from the six blood variables
# and their interactions with the batch covariate.
# NOTE(review): `dfF` is not defined in this file — see earlier note;
# it must contain the transformed blood columns plus Treatment.
dfF$target <- as.factor(dfF$Treatment)
df_glm <- glm(data = dfF,
              formula = target ~
                (Tot_WBC + Het_ABS + Lym_ABS + 
                `H:L Ratio` + Mon_ABS + OthG_ABS)*batch,
              family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (translated from Russian; typically a sign of quasi-separation)
summary(df_glm)
## 
## Call:
## glm(formula = target ~ (Tot_WBC + Het_ABS + Lym_ABS + `H:L Ratio` + 
##     Mon_ABS + OthG_ABS) * batch, family = "binomial", data = dfF)
## 
## Coefficients: (2 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -6.722      2.019  -3.330 0.000868 ***
## Tot_WBC            -29.958     15.847  -1.891 0.058689 .  
## Het_ABS             -1.510     10.301  -0.147 0.883437    
## Lym_ABS            -12.282      4.657  -2.637 0.008352 ** 
## `H:L Ratio`        -10.392      3.351  -3.101 0.001928 ** 
## Mon_ABS             -9.322      2.733  -3.412 0.000646 ***
## OthG_ABS                NA         NA      NA       NA    
## batch               -8.561     27.148  -0.315 0.752502    
## Tot_WBC:batch      217.299    365.399   0.595 0.552050    
## Het_ABS:batch     -238.897    242.847  -0.984 0.325247    
## Lym_ABS:batch       -9.291    121.280  -0.077 0.938936    
## `H:L Ratio`:batch   49.846     63.896   0.780 0.435320    
## Mon_ABS:batch       65.145     68.589   0.950 0.342220    
## OthG_ABS:batch          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136.100  on 121  degrees of freedom
## Residual deviance:  23.312  on 110  degrees of freedom
## AIC: 47.312
## 
## Number of Fisher Scoring iterations: 9
# NOTE(review): OthG_ABS and its batch interaction are aliased (NA).
# Presumably the row-wise centring above makes the six variables sum
# to ~0 within each observation, so any five determine the sixth —
# confirm against the transformation chunk.

The OthG_ABS coefficient is undefined most likely because of collinearity: after the row-wise centring, the six blood variables sum to approximately zero within each observation, so the last variable is a linear combination of the other five and glm aliases it.

Habitats

Group variance

# Restrict to lizards from modified landscapes and repeat the
# dispersion-homogeneity check, this time across habitats.
# NOTE(review): depends on `dfF`, which is not defined in this file.
dfH <- dfF %>% subset(Treatment == 'modified')

df_PCO <- dfH[,1:6] %>% 
  vegdist(method  = "euclidean") %>% 
  betadisper(dfH$Habitat)
plot(df_PCO)

# p = 0.73: dispersions are homogeneous across habitats.
anova(df_PCO)
## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value Pr(>F)
## Groups     2  0.315 0.15738  0.3125 0.7324
## Residuals 89 44.828 0.50369

Group dispersions are homogeneous now (with scaling via various datawizard package functions, this was not the case).

perMANOVA: habitats

# perMANOVA within modified landscapes: Habitat crossed with the
# batch covariate, Euclidean distances, sequential terms.
df_adonis <- adonis2(dfH[,1:6] ~ dfH$Habitat*dfH$batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dfH[, 1:6] ~ dfH$Habitat * dfH$batch, method = "euclidean")
##                       Df SumOfSqs      R2      F Pr(>F)
## dfH$Habitat            2    2.650 0.02825 1.2819  0.261
## dfH$batch              1    0.745 0.00794 0.7209  0.479
## dfH$Habitat:dfH$batch  2    1.514 0.01614 0.7322  0.560
## Residual              86   88.900 0.94767              
## Total                 91   93.810 1.00000

The difference is not significant across different habitats.

perMANOVA: connectivity

# perMANOVA within modified landscapes: Connectivity crossed with the
# batch covariate.
df_adonis <- adonis2(dfH[,1:6] ~ dfH$Connectivity*dfH$batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$batch, method = "euclidean")
##                            Df SumOfSqs      R2      F Pr(>F)
## dfH$Connectivity            1    0.992 0.01058 0.9538  0.420
## dfH$batch                   1    0.923 0.00984 0.8869  0.410
## dfH$Connectivity:dfH$batch  1    0.341 0.00363 0.3274  0.728
## Residual                   88   91.554 0.97595              
## Total                      91   93.810 1.00000

The difference is also not significant

perMANOVA: connectivity and habitat

# Full perMANOVA: Connectivity * Habitat * batch. With sequential
# (Type I) terms, the order of the factors affects the partitioning.
df_adonis <- adonis2(dfH[,1:6] ~ dfH$Connectivity*dfH$Habitat*dfH$batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch, method = "euclidean")
##                                        Df SumOfSqs      R2      F Pr(>F)
## dfH$Connectivity                        1    0.992 0.01058 0.9402  0.368
## dfH$Habitat                             2    2.609 0.02781 1.2358  0.299
## dfH$batch                               1    0.861 0.00918 0.8162  0.427
## dfH$Connectivity:dfH$Habitat            2    0.959 0.01022 0.4542  0.789
## dfH$Connectivity:dfH$batch              1    0.241 0.00257 0.2287  0.825
## dfH$Habitat:dfH$batch                   2    1.422 0.01516 0.6738  0.634
## dfH$Connectivity:dfH$Habitat:dfH$batch  2    2.290 0.02441 1.0847  0.369
## Residual                               80   84.435 0.90007              
## Total                                  91   93.810 1.00000

Also, no visible relations. It might help to balance the variables somehow, but not now.

GLM?

# Try to predict the habitat/connectivity class from the blood
# variables.
# NOTE(review): `target` is Habitat pasted with Connectivity and can
# have more than two levels, but family = "binomial" makes glm model
# the FIRST factor level against all the others. If more than two
# classes actually occur here, a multinomial model (e.g.
# nnet::multinom) would be the appropriate tool — confirm the number
# of levels before trusting these coefficients.
dfH$target <- as.factor(paste(dfH$Habitat, dfH$Connectivity))
df_glm <- glm(data = dfH,
              formula = target ~
                (Tot_WBC + Het_ABS + Lym_ABS + 
                `H:L Ratio` + Mon_ABS + OthG_ABS),
              family = "binomial")

summary(df_glm)
## 
## Call:
## glm(formula = target ~ (Tot_WBC + Het_ABS + Lym_ABS + `H:L Ratio` + 
##     Mon_ABS + OthG_ABS), family = "binomial", data = dfH)
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    4.468      1.414   3.159  0.00159 **
## Tot_WBC       52.135     26.344   1.979  0.04782 * 
## Het_ABS      -34.319     16.411  -2.091  0.03650 * 
## Lym_ABS      -13.039      8.389  -1.554  0.12011   
## `H:L Ratio`    6.336      3.463   1.830  0.06729 . 
## Mon_ABS       -1.675      2.436  -0.687  0.49178   
## OthG_ABS          NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.360  on 91  degrees of freedom
## Residual deviance: 35.191  on 86  degrees of freedom
## AIC: 47.191
## 
## Number of Fisher Scoring iterations: 7
# Diagnostic plots; used below to pick outliers for removal.
plot(df_glm)

Several outliers could be removed to improve both GLM and perMANOVA

# Drop the outlying observations spotted in the diagnostic plots.
# Matching on row NAMES (which survive the earlier subsetting), not
# on row positions.
dfH <- dfH[!(rownames(dfH) %in% c(70, 91, 96, 119)),]

# Refit the same model on the reduced data.
df_glm <- glm(data = dfH,
              formula = target ~
                (Tot_WBC + Het_ABS + Lym_ABS + 
                `H:L Ratio` + Mon_ABS + OthG_ABS),
              family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (warnings translated from Russian; the second typically indicates
## quasi-separation)
plot(df_glm)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

# Re-run the full perMANOVA after the outlier removal above.
df_adonis <- adonis2(dfH[,1:6] ~ dfH$Connectivity*dfH$Habitat*dfH$batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch, method = "euclidean")
##                                        Df SumOfSqs      R2      F Pr(>F)
## dfH$Connectivity                        1    0.956 0.01027 0.8700  0.417
## dfH$Habitat                             2    2.703 0.02904 1.2297  0.324
## dfH$batch                               1    0.935 0.01004 0.8506  0.422
## dfH$Connectivity:dfH$Habitat            2    0.834 0.00896 0.3795  0.837
## dfH$Connectivity:dfH$batch              1    0.206 0.00221 0.1870  0.848
## dfH$Habitat:dfH$batch                   2    1.632 0.01754 0.7425  0.577
## dfH$Connectivity:dfH$Habitat:dfH$batch  2    2.270 0.02439 1.0327  0.375
## Residual                               76   83.529 0.89754              
## Total                                  87   93.065 1.00000

Still, no connection according to the permutation ANOVA, so the GLM may be relying on noisy features for its predictions.

Answers

Task 1

  1. Yes
  2. No
  3. No

Task 2
  4. No